[RISCV] Expand multiplication by 2^N * 3/5/9 + 1 with SHL_ADD
#166933
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Piotr Fusik (pfusik)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/166933.diff

2 Files Affected:
- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
- llvm/test/CodeGen/RISCV/rv64zba.ll
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c1d38419992b1..637a46c508c1f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16544,12 +16544,23 @@ static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG,
break;
}
- // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X)
int ShX;
if (int ShY = isShifted359(MulAmt - 1, ShX)) {
assert(ShX != 0 && "MulAmt=4,6,10 handled before");
+ // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X)
if (ShX <= 3)
return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true, Shift);
+ // 2^N * 3/5/9 + 1 -> (add (shYadd (shl X, N), (shl X, N)), X)
+ if (Shift == 0) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue X = N->getOperand(0);
+ SDValue Shl =
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShX, DL, VT));
+ SDValue ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
+ DAG.getTargetConstant(ShY, DL, VT), Shl);
+ return DAG.getNode(ISD::ADD, DL, VT, ShlAdd, X);
+ }
}
return SDValue();
}
@@ -16610,7 +16621,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
DAG.getTargetConstant(Shift, DL, VT), Shift1);
}
- // TODO: 2^(C1>3) * 3,5,9 +/- 1
+ // TODO: 2^(C1>3) * 3/5/9 - 1
// 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index e56c7b41d43ce..156599fb72877 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -944,6 +944,58 @@ define i64 @addmul146(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @mul49(i64 %a) {
+; RV64I-LABEL: mul49:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 49
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul49:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 4
+; RV64ZBA-NEXT: sh1add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul49:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 4
+; RV64XANDESPERF-NEXT: nds.lea.h a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 49
+ ret i64 %c
+}
+
+define i64 @zext_mul49(i32 signext %a) {
+; RV64I-LABEL: zext_mul49:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 49
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mulhu a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: zext_mul49:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli.uw a1, a0, 4
+; RV64ZBA-NEXT: sh1add a1, a1, a1
+; RV64ZBA-NEXT: add.uw a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: zext_mul49:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 32
+; RV64XANDESPERF-NEXT: srli a1, a1, 28
+; RV64XANDESPERF-NEXT: nds.lea.h a1, a1, a1
+; RV64XANDESPERF-NEXT: nds.lea.b.ze a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %b = zext i32 %a to i64
+ %c = mul i64 %b, 49
+ ret i64 %c
+}
+
define i64 @mul50(i64 %a) {
; RV64I-LABEL: mul50:
; RV64I: # %bb.0:
@@ -1044,6 +1096,54 @@ define i64 @addmul100(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @mul145(i64 %a) {
+; RV64I-LABEL: mul145:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 145
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul145:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 4
+; RV64ZBA-NEXT: sh3add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul145:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 4
+; RV64XANDESPERF-NEXT: nds.lea.d a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 145
+ ret i64 %c
+}
+
+define i64 @mul161(i64 %a) {
+; RV64I-LABEL: mul161:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 161
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul161:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 5
+; RV64ZBA-NEXT: sh2add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul161:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 5
+; RV64XANDESPERF-NEXT: nds.lea.w a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 161
+ ret i64 %c
+}
+
define i64 @mul162(i64 %a) {
; RV64I-LABEL: mul162:
; RV64I: # %bb.0:
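For reference, every constant in the new tests has the form 2^N * {3,5,9} + 1: 49 = 16 * 3 + 1, 145 = 16 * 9 + 1, 161 = 32 * 5 + 1. The following is a minimal standalone C++ sketch of the identity the expansion relies on; it is not part of the patch, and the helper name shl_add_expand is made up for illustration.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Sketch of the new lowering: MulAmt = 2^N * (2^Y + 1) + 1 computed as
//   t = x << N          (slli)
//   t = (t << Y) + t    (shYadd, Y in {1,2,3}, i.e. *3, *5 or *9)
//   r = t + x           (add)
static uint64_t shl_add_expand(uint64_t x, unsigned n, unsigned y) {
  uint64_t t = x << n;   // slli
  t = (t << y) + t;      // shYadd
  return t + x;          // add
}

int main() {
  for (uint64_t x : {UINT64_C(0), UINT64_C(1), UINT64_C(3),
                     UINT64_C(0x123456789abcdef0)}) {
    assert(shl_add_expand(x, 4, 1) == x * 49);  // 49  = 16 * 3 + 1 (mul49)
    assert(shl_add_expand(x, 4, 3) == x * 145); // 145 = 16 * 9 + 1 (mul145)
    assert(shl_add_expand(x, 5, 2) == x * 161); // 161 = 32 * 5 + 1 (mul161)
  }
  return 0;
}

The identity holds modulo 2^64, so wraparound in the shifts and adds matches wraparound in the multiply.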
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: slli a1, a0, 4
; RV64ZBA-NEXT: sh1add a1, a1, a1
; RV64ZBA-NEXT: add a0, a1, a0
If my CPU has a 2 cycle multiply latency like SiFiveP400/600/800, this seems worse. Should I consider adding a tuning flag to disable this for my CPUs?
This PR expands mul into 3 instructions, with a possible fold of zext into slli.uw.
Most of the transforms in expandMul expand into three instructions, so I assumed it's okay.
If not (for reasons other than hasMinSize(), which is already handled), I think we should handle that together with the other 3-instruction expansions.
Ok, I agree with that.
Some of the transforms have ILP between 2 of the 3 instructions. This lowering is serial. There are probably other serial lowerings. On a core with more shift/add resources than multiply resources, a serial implementation like this could still be beneficial if it prevents the multiplier from becoming a bottleneck.
On a single-issue, in-order CPU with a pipelined multiplier, this may just increase code size for no benefit. Freeing up the multiplier doesn't help if you can't issue any more instructions.
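To make the trade-off concrete, here is a hand-written C++ sketch of the mul49 chain; the function name is illustrative, and the cycle count assumes single-cycle shifts and adds. Each statement consumes the previous result, so the chain is three cycles deep, versus the 2-cycle pipelined multiply cited for SiFiveP400/600/800.

#include <cstdint>

// Serial dependence chain of the 49 * x expansion
// (matches the RV64ZBA sequence in the mul49 test above).
uint64_t mul49_expanded(uint64_t x) {
  uint64_t t0 = x << 4;          // slli   a1, a0, 4
  uint64_t t1 = (t0 << 1) + t0;  // sh1add a1, a1, a1   (needs t0)
  return t1 + x;                 // add    a0, a1, a0   (needs t1)
}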
There are probably other serial lowerings.
- 3/5/9 * 3/5/9 * 2^N
- (2/4/8 * 3/5/9 + 1) * 2^N (one such chain is sketched below)
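As an illustration of one such serial chain, a hand-written C++ sketch; the constant 90 = 5 * 9 * 2 and the function name are just an example, not output taken from the compiler.

#include <cstdint>

// 3/5/9 * 3/5/9 * 2^N, e.g. 90 = 5 * 9 * 2: three instructions, fully serial.
uint64_t mul90_expanded(uint64_t x) {
  uint64_t t0 = (x << 2) + x;    // sh2add: x * 5
  uint64_t t1 = (t0 << 3) + t0;  // sh3add: x * 45   (needs t0)
  return t1 << 1;                // slli:   x * 90   (needs t1)
}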
topperc left a comment:
LGTM